The libraries I will use:

library(data.table)
library(dplyr)
library(jpeg)
library(imager)
library(factoextra)

I read the data and apply PCA:

# Load the Musk1 data set.  read.csv() already returns a data.frame, so the
# original `%>% as.data.frame()` was redundant and has been dropped.
raw_data <- read.csv("/Users/ayberkakgun/Desktop/Musk1.csv")
# Columns 3-168 hold the 166 numeric features; standardize them so every
# feature contributes on the same scale to the PCA.
features <- scale(raw_data[3:168])
# Columns 1-2 are the bag class label and the bag identifier
# (TODO confirm the column order against the CSV header).
labels <- raw_data[1:2]
colnames(labels) <- c("BagClass", "BagId")
# PCA on the standardized features; with unit-variance inputs the
# covariance-based princomp() is equivalent to a correlation PCA.
pca <- princomp(features)
str(pca)
## List of 7
##  $ sdev    : Named num [1:166] 7.19 4.8 3.55 2.9 2.86 ...
##   ..- attr(*, "names")= chr [1:166] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
##  $ loadings: loadings [1:166, 1:166] -0.036844 -0.06835 -0.094764 0.092619 -0.000908 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:166] "X42" "X.198" "X.109" "X.75" ...
##   .. ..$ : chr [1:166] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
##  $ center  : Named num [1:166] 1.60e-16 -6.24e-17 3.54e-17 1.66e-17 -6.13e-17 ...
##   ..- attr(*, "names")= chr [1:166] "X42" "X.198" "X.109" "X.75" ...
##  $ scale   : Named num [1:166] 1 1 1 1 1 1 1 1 1 1 ...
##   ..- attr(*, "names")= chr [1:166] "X42" "X.198" "X.109" "X.75" ...
##  $ n.obs   : int 475
##  $ scores  : num [1:475, 1:166] -0.787 -0.255 -1.359 -1.464 -1.439 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:166] "Comp.1" "Comp.2" "Comp.3" "Comp.4" ...
##  $ call    : language princomp(x = features)
##  - attr(*, "class")= chr "princomp"
# Explained-variance table (eigenvalue, percent, cumulative percent)
# for each principal component, via factoextra.
eigenvalue <- pca %>% get_eigenvalue()
# Inspect the leading 15 components.
head(eigenvalue, n = 15)
##        eigenvalue variance.percent cumulative.variance.percent
## Dim.1   51.747369        31.238880                    31.23888
## Dim.2   23.049909        13.914782                    45.15366
## Dim.3   12.628826         7.623777                    52.77744
## Dim.4    8.400432         5.071178                    57.84862
## Dim.5    8.164036         4.928469                    62.77709
## Dim.6    6.755840         4.078369                    66.85545
## Dim.7    5.364907         3.238690                    70.09414
## Dim.8    5.040095         3.042607                    73.13675
## Dim.9    3.302223         1.993488                    75.13024
## Dim.10   2.849416         1.720137                    76.85038
## Dim.11   2.570613         1.551829                    78.40221
## Dim.12   2.375583         1.434093                    79.83630
## Dim.13   2.205900         1.331659                    81.16796
## Dim.14   2.107007         1.271959                    82.43992
## Dim.15   1.812080         1.093917                    83.53383

#a It takes 12 components to explain 80% of the variability, and the marginal contribution of each additional component gets smaller.

# Percent variance of every component before the 85 % cumulative mark.
eig <- eigenvalue$variance.percent[eigenvalue$cumulative.variance.percent < 85]
l <- length(eig)
# Scree plot: bars with an overlaid line.  The line colour is SET outside
# aes(); the original put col = "red" inside aes(), which maps the literal
# string as data — producing a spurious legend and a default-palette colour
# instead of red.
ggplot() +
    geom_col(aes(x = seq_len(l), y = eig)) +
    geom_line(aes(x = seq_len(l), y = eig), col = "red")

If we check the plot for two components, we see two dimensions are not enough to distinguish bag labels in space:

# Scatter of the first two principal-component scores, coloured by the bag
# label (raw_data$X1; presumably a 0/1 class, so +1 selects palette colours
# 1 = black and 2 = red — TODO confirm X1 is the label column).
plot(x=pca$scores[,1],y=pca$scores[,2],col=raw_data$X1+1)
# Dotted reference lines through the origin.
abline(h=0,v=0,lty=3)

I calculate the distance matrix and perform MDS on it, without forgetting to scale the data. This plot confirms, via another method, that two components are not enough to explain the variability and distinguish the bag labels.

# Euclidean distance matrix on the standardized features.  TRUE replaces
# the reassignable shorthand T; upper = TRUE fills the upper triangle so
# the printed matrix is symmetric (cmdscale only needs the distances).
distanceMat <- dist(features, upper = TRUE) %>% as.matrix()
# Classical (metric) multidimensional scaling, default k = 2 dimensions.
mds <- cmdscale(distanceMat)
# Same colouring convention as the PCA score plot: bag label + 1.
plot(mds[, 1], mds[, 2],
     xlab = 'Component 1', ylab = 'Component 2',
     col = raw_data$X1 + 1)

#b I perform PCA once again, this time on the aggregated data; now 6 components are enough to explain 80% of the variability. Hence the points are distributed more heterogeneously.

# Collapse instance rows to one mean row per bag, grouping on column X1.1
# (presumably the bag identifier — TODO confirm against the CSV header).
data_agg<-aggregate(raw_data,by=list(raw_data$X1.1),FUN=mean)
# aggregate() prepends the grouping column, so the 166 features now occupy
# columns 4-169; standardize them as before.
features_agg<-scale(data_agg[4:169])
# SVD-based PCA; the data are already centred and scaled by scale().
pca_agg<-prcomp(features_agg)
str(pca_agg)
## List of 5
##  $ sdev    : num [1:92] 6.65 5.62 4.69 4.21 3.1 ...
##  $ rotation: num [1:166, 1:92] -0.0536 -0.0653 -0.0607 0.0848 -0.0135 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:166] "X42" "X.198" "X.109" "X.75" ...
##   .. ..$ : chr [1:92] "PC1" "PC2" "PC3" "PC4" ...
##  $ center  : Named num [1:166] -1.35e-16 -6.12e-17 -8.78e-17 -2.73e-17 -4.36e-17 ...
##   ..- attr(*, "names")= chr [1:166] "X42" "X.198" "X.109" "X.75" ...
##  $ scale   : Named num [1:166] 18.1 76.8 57.4 66.2 20.5 ...
##   ..- attr(*, "names")= chr [1:166] "X42" "X.198" "X.109" "X.75" ...
##  $ x       : num [1:92, 1:92] -1.63 -3.51 -4.39 -3.38 -5.25 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : NULL
##   .. ..$ : chr [1:92] "PC1" "PC2" "PC3" "PC4" ...
##  - attr(*, "class")= chr "prcomp"
# Explained-variance table for the bag-level PCA.
eigenvalue_agg <- pca_agg %>% get_eigenvalue()
# Show the leading 10 components.
head(eigenvalue_agg, n = 10)
##        eigenvalue variance.percent cumulative.variance.percent
## Dim.1   44.241806        26.651690                    26.65169
## Dim.2   31.636561        19.058170                    45.70986
## Dim.3   21.961633        13.229899                    58.93976
## Dim.4   17.709779        10.668542                    69.60830
## Dim.5    9.609901         5.789097                    75.39740
## Dim.6    7.555042         4.551230                    79.94863
## Dim.7    5.042804         3.037834                    82.98646
## Dim.8    3.553406         2.140606                    85.12707
## Dim.9    3.268410         1.968922                    87.09599
## Dim.10   2.567582         1.546736                    88.64273
# First two principal components of the bag-level data, coloured by bag
# label; a two-column matrix plots column 1 against column 2.
plot(pca_agg$x[, 1:2],
     col = data_agg$X1 + 1,
     xlab = "Component 1",
     ylab = "Component 2")

The same MDS operation on the aggregated data results in a plot similar to the PCA one. Most of the red points stay in a triangular area of the space.

# Distance matrix on the bag-level features; TRUE replaces the
# reassignable shorthand T.
distanceMat_agg <- dist(features_agg, upper = TRUE) %>% as.matrix()
# Classical MDS down to two dimensions.
mds_agg <- cmdscale(distanceMat_agg)
# Unlabelled axes on purpose; colour = bag label + 1 as elsewhere.
plot(mds_agg[, 1], mds_agg[, 2],
     xlab = '', ylab = '',
     col = data_agg$X1 + 1)

#Task 2

I read my image and plot it.

# Read the JPEG as a numeric array with values in [0, 1]
# (height x width x 3 channels).
img <- readJPEG("/Users/ayberkakgun/Desktop/profil2.jpg")
# Open an empty 256 x 256 canvas, then draw the bitmap onto it
# (assumes the image is 256 x 256 — TODO confirm with dim(img)).
plot(c(0, 256), c(0, 256), type = "n", xlab = "", ylab = "")
rasterImage(img, 0, 0, 256, 256)

I split the channels and plot them separately.

# Extract the three colour channels.  Full slices [ , , k] keep the whole
# channel for any image size; the original hard-coded 1:256, which only
# works for a 256 x 256 image.
r <- img[, , 1]
g <- img[, , 2]
b <- img[, , 3]
# One single-hue ramp per channel, black up to the channel colour.
red_palette <- colorRampPalette(c("black", "red"))
blue_palette <- colorRampPalette(c("black", "blue"))
green_palette <- colorRampPalette(c("black", "green"))

# NOTE(review): image() draws the matrix rotated relative to the photo's
# natural orientation; fine here since only intensity patterns matter.
image(r, col = red_palette(256), main = "red", axes = FALSE)

image(g, col = green_palette(256), main = "green", axes = FALSE)

image(b, col = blue_palette(256), main = "blue", axes = FALSE)

I add uniformly distributed random noise to my image, clamp the values so that they do not exceed 1, and display the noisy image.

# Uniform noise in [min(img), 0.1 * max(img)] — roughly [0, 0.1] for a
# [0, 1] image.  NOTE(review): using min(img) as the lower bound looks
# deliberate; confirm it should not simply be 0.
minx <- min(img)
maxx <- max(img) * 0.1
# Size the noise field from the channel itself rather than hard-coding
# 65536 / 256, so the code works for any image dimensions.
noise <- matrix(runif(length(r), min = minx, max = maxx), nrow = nrow(r))
# Add the same noise field to every channel and clamp at 1 so the values
# remain valid intensities for rasterImage().
r2 <- pmin(r + noise, 1)
g2 <- pmin(g + noise, 1)
b2 <- pmin(b + noise, 1)
# Reassemble the noisy image, preserving img's dimensions and attributes.
img2 <- img
img2[, , 1] <- r2
img2[, , 2] <- g2
img2[, , 3] <- b2
plot(c(0, 256), c(0, 256), type = "n", xlab = "", ylab = "")
rasterImage(img2, 0, 0, 256, 256)

I construct the patches and perform PCA on them.

# Re-load the image with imager and reduce it to one grayscale channel.
imgr <- load.image("/Users/ayberkakgun/Desktop/profil2.jpg")
imgrr <- grayscale(imgr)
# Slide a patch_size x patch_size window over the image one pixel at a
# time; each patch becomes one row of `vectors`.  All sizes are derived
# from the image instead of hard-coding 53824 / 625 / 232 / 24
# (for a 256 x 256 image these evaluate to the same numbers).
patch_size <- 25
n_row <- nrow(imgrr) - patch_size + 1   # 232 for a 256 x 256 image
n_col <- ncol(imgrr) - patch_size + 1
vectors <- matrix(nrow = n_row * n_col, ncol = patch_size^2)
k <- 1
for (i in seq_len(n_row)) {
    for (j in seq_len(n_col)) {
        vectors[k, ] <- imgrr[i:(i + patch_size - 1), j:(j + patch_size - 1)] %>% as.vector()
        k <- k + 1
    }
}
# PCA on the patch matrix: each of the 53 824 patches is an observation
# with 625 pixel variables.
pca_img<-princomp(vectors)
eigenvalue_img<-get_eigenvalue(pca_img)
head(eigenvalue_img,5)
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1 31.1588597        68.497106                    68.49711
## Dim.2  4.3178490         9.492008                    77.98911
## Dim.3  1.8995843         4.175892                    82.16501
## Dim.4  0.9781981         2.150391                    84.31540
## Dim.5  0.6798462         1.494519                    85.80992
# Cumulative explained variance across all components.
plot(eigenvalue_img$cumulative.variance.percent)

To plot the images, I first scale the scores between 0 and 1 and then construct images for the first, second and third components. They give a reasonably good indication of the image, degrading from the first to the third component, as expected.

# Scores of every patch on the first three principal components.
score1 <- pca_img$scores[, 1]
score2 <- pca_img$scores[, 2]
score3 <- pca_img$scores[, 3]

# Min-max rescaling to [0, 1] so the scores can be rendered as grayscale.
rescale01 <- function(v) (v - min(v)) / (max(v) - min(v))
score1_scaled <- rescale01(score1)
score2_scaled <- rescale01(score2)
score3_scaled <- rescale01(score3)

# One 232 x 232 image per component: each pixel holds the score of the
# patch whose corner sits at that position (row-major order, hence byrow).
imgp1 <- matrix(score1_scaled, nrow = 232, byrow = TRUE)
imgp2 <- matrix(score2_scaled, nrow = 232, byrow = TRUE)
imgp3 <- matrix(score3_scaled, nrow = 232, byrow = TRUE)

image(imgp1, col = gray.colors(256), main = "", axes = FALSE)

image(imgp2, col = gray.colors(256), main = "", axes = FALSE)

image(imgp3, col = gray.colors(256), main = "", axes = FALSE)

The same procedure for the eigenvectors (loadings), which of course gives the worst representation of the image; it only gives an idea of where the object in the image is concentrated.

# Loadings (eigenvectors) of the first three components: 625 pixel
# weights each.
eig1 <- pca_img$loadings[, 1]
eig2 <- pca_img$loadings[, 2]
eig3 <- pca_img$loadings[, 3]

# Map each eigenvector onto [0, 1] using its own range.
norm_range <- function(v) (v - min(v)) / diff(range(v))
eig1_scaled <- norm_range(eig1)
eig2_scaled <- norm_range(eig2)
eig3_scaled <- norm_range(eig3)

# Fold the 625 weights back into the 25 x 25 patch layout.
imge1 <- matrix(eig1_scaled, nrow = 25, byrow = TRUE)
imge2 <- matrix(eig2_scaled, nrow = 25, byrow = TRUE)
imge3 <- matrix(eig3_scaled, nrow = 25, byrow = TRUE)

image(imge1, col = gray.colors(256), main = "", axes = FALSE)

image(imge2, col = gray.colors(256), main = "", axes = FALSE)

image(imge3, col = gray.colors(256), main = "", axes = FALSE)